# O1 R Masters :)
## Load project dependencies. libraries.R is expected to attach the packages
## used below (fread, ggplot, melt, datatable, ...) -- TODO confirm.
source("notebooks/libraries.R")

## Read the campaign data, keeping text columns as character. FALSE is spelled
## out because `F` is an ordinary variable that can be reassigned.
raw_train_data <- fread("Data/BankCamp_train.csv", stringsAsFactors = FALSE)
raw_test_data <- fread("Data/BankCamp_test.csv", stringsAsFactors = FALSE)

## Inspect the column types of the training set.
str(raw_train_data)
## Classes 'data.table' and 'data.frame': 36168 obs. of 17 variables:
## $ age : int 50 47 56 36 41 32 26 60 39 55 ...
## $ job : chr "entrepreneur" "technician" "housemaid" "blue-collar" ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education: chr "primary" "secondary" "primary" "primary" ...
## $ default : chr "yes" "no" "no" "no" ...
## $ balance : int 537 -938 605 4608 362 0 782 193 2140 873 ...
## $ housing : chr "yes" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "unknown" "unknown" "cellular" "cellular" ...
## $ day : int 20 28 19 14 12 4 29 12 16 3 ...
## $ month : chr "jun" "may" "aug" "may" ...
## $ duration : int 11 176 207 284 217 233 297 89 539 131 ...
## $ campaign : int 15 2 6 7 3 3 1 2 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 276 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 2 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
## - attr(*, ".internal.selfref")=<externalptr>
## Inspect the column types of the test set.
str(object = raw_test_data)
## Classes 'data.table' and 'data.frame': 9043 obs. of 16 variables:
## $ age : int 58 43 51 56 32 54 58 54 32 38 ...
## $ job : chr "management" "technician" "retired" "management" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "primary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 593 229 779 23 529 -364 1291 0 424 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 55 353 164 160 1492 355 266 179 104 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## - attr(*, ".internal.selfref")=<externalptr>
## Predictor names grouped by type; used below to select columns for the
## summary tables and plots.
discrete_var <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "poutcome"
)
continuous_var <- c(
  "age", "balance", "day", "duration", "campaign", "pdays", "previous"
)
## Bar chart of how often each value of the target `y` occurs in the
## training set; the legend is hidden because the x axis already names the
## classes.
target_dist <- ggplot(raw_train_data, aes(x = y, fill = y)) +
  geom_bar() +
  scale_fill_manual(values = c("#995052", "#529950")) +
  labs(
    x = "target - campaign offer accepted",
    title = "Distribution of target variable"
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
target_dist
## Summary statistics (quantiles, mean, standard deviation) of the continuous
## predictors in the training set, rendered as an interactive table.
cont_var <- as.data.frame(raw_train_data)
## drop = FALSE keeps a data frame even when only one column matches.
cont_var <- cont_var[, names(cont_var) %in% continuous_var, drop = FALSE]

## vapply() pins the shape of each result (five quantiles, or one number, per
## column); sapply() would silently change its return type on unusual input.
distribution <- as.data.frame(t(vapply(cont_var, quantile, numeric(5))))
distribution$Mean <- vapply(cont_var, mean, numeric(1))
distribution$SD <- vapply(cont_var, sd, numeric(1))
datatable(round(distribution, 2))
## Reshape the continuous predictors to long format (one row per
## variable/value pair) so each variable can get its own facet.
cont_var_melt <- as.data.frame(melt(cont_var))

## One density panel per continuous variable; axes are free because the
## variables are on different scales.
cont_dist <- ggplot(cont_var_melt, aes(x = value)) +
  geom_density(aes(fill = variable)) +
  facet_wrap(~variable, scales = "free", nrow = 3) +
  scale_fill_tableau() +
  labs(
    x = "",
    y = "",
    fill = "",
    title = "Distribution of each continous variable"
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
cont_dist
## Label every training row with its origin so the two sets can still be
## told apart after they are stacked.
raw_train_data$dataType <- "train"

## The test set has no target column: add an empty `y` first so its columns
## line up with the training set, then label its rows as well.
raw_test_data$y <- NA
raw_test_data$dataType <- "test"

## Stack the training rows on top of the test rows.
dataset <- rbind(raw_train_data, raw_test_data)
## Train-vs-test density comparison for every continuous predictor.

## Overlay the train and test densities of one column of `dataset`.
##   mapping - aes() mapping that puts the column on x and dataType on color
##   title   - plot title
## Returns a ggplot object; calling it at top level prints the plot.
plot_split_density <- function(mapping, title) {
  ggplot(dataset, mapping) +
    geom_density(alpha = 0.7) +
    ggtitle(title) +
    theme_classic() +
    ## Colours are named so they stay tied to the right set regardless of
    ## level order (same assignment as the previous positional vector).
    scale_color_manual(values = c(test = "#e08926", train = "#3526e0"))
}

plot_split_density(aes(x = age, color = dataType), "Age Distribution")
plot_split_density(aes(x = balance, color = dataType), "Balance Distribution")
plot_split_density(aes(x = day, color = dataType), "Day Distribution")
plot_split_density(
  aes(x = duration, color = dataType), "Duration Distribution"
)
plot_split_density(
  aes(x = campaign, color = dataType), "Campaign Distribution"
)
plot_split_density(aes(x = pdays, color = dataType), "pdays Distribution")
plot_split_density(
  aes(x = previous, color = dataType), "Previous Distribution"
)
## Long-format table of the discrete predictors of the training set.
## The columns are turned into a character matrix explicitly: the previous
## sapply(..., as.factor) produced the same matrix, because the factors were
## dropped when sapply() simplified its result.
df_disc <- as.matrix(raw_train_data[, ..discrete_var])
## Melting a matrix yields the columns Var1 (row index), Var2 (variable name)
## and value (the level).
df_disc <- as.data.frame(melt(df_disc))

## One bar-chart panel per discrete variable showing the count of each level.
disc_dist <- ggplot(df_disc, aes(x = value)) +
  geom_bar(aes(fill = Var2)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
  scale_x_discrete(expand = c(0, 0)) +
  facet_wrap(~Var2, scales = "free", nrow = 2) +
  scale_fill_tableau() +
  ggtitle("Count of each discrete variable") +
  labs(fill = "", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
disc_dist
## Side-by-side train/test bar charts for every discrete predictor.

## Build the level-count bar chart shared by all comparisons below.
##   data     - data set whose rows are counted
##   mapping  - aes() mapping that puts the categorical column on x
##   bar_fill - fill colour of the bars
##   title    - plot title
## Returns a ggplot object.
plot_level_counts <- function(data, mapping, bar_fill, title) {
  ggplot(data, mapping) +
    geom_bar(fill = bar_fill) +
    ggtitle(title) +
    labs(y = "", x = "") +
    theme_minimal() +
    theme(
      text = element_text(face = "plain"),
      legend.position = "none",
      axis.text.x = element_text(size = 7, angle = 90),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.title = element_text(hjust = 0.5)
    )
}

## job
job_train <- plot_level_counts(
  raw_train_data, aes(x = job), "#3526e0", "Job Distribution - Train"
)
job_test <- plot_level_counts(
  raw_test_data, aes(x = job), "#e08926", "Job Distribution - Test"
)
grid.arrange(job_train, job_test, ncol = 2)

## marital
marital_train <- plot_level_counts(
  raw_train_data, aes(x = marital), "#3526e0", "Marital Distribution - Train"
)
marital_test <- plot_level_counts(
  raw_test_data, aes(x = marital), "#e08926", "Marital Distribution - Test"
)
grid.arrange(marital_train, marital_test, ncol = 2)

## education
education_train <- plot_level_counts(
  raw_train_data, aes(x = education), "#3526e0",
  "Education Distribution - Train"
)
education_test <- plot_level_counts(
  raw_test_data, aes(x = education), "#e08926",
  "Education Distribution - Test"
)
grid.arrange(education_train, education_test, ncol = 2)

## default
default_train <- plot_level_counts(
  raw_train_data, aes(x = default), "#3526e0", "Default Distribution - Train"
)
default_test <- plot_level_counts(
  raw_test_data, aes(x = default), "#e08926", "Default Distribution - Test"
)
grid.arrange(default_train, default_test, ncol = 2)

## housing
housing_train <- plot_level_counts(
  raw_train_data, aes(x = housing), "#3526e0", "Housing Distribution - Train"
)
housing_test <- plot_level_counts(
  raw_test_data, aes(x = housing), "#e08926", "Housing Distribution - Test"
)
grid.arrange(housing_train, housing_test, ncol = 2)

## loan
loan_train <- plot_level_counts(
  raw_train_data, aes(x = loan), "#3526e0", "Loan Distribution - Train"
)
loan_test <- plot_level_counts(
  raw_test_data, aes(x = loan), "#e08926", "Loan Distribution - Test"
)
grid.arrange(loan_train, loan_test, ncol = 2)

## contact
contact_train <- plot_level_counts(
  raw_train_data, aes(x = contact), "#3526e0", "Contact Distribution - Train"
)
contact_test <- plot_level_counts(
  raw_test_data, aes(x = contact), "#e08926", "Contact Distribution - Test"
)
grid.arrange(contact_train, contact_test, ncol = 2)

## month
month_train <- plot_level_counts(
  raw_train_data, aes(x = month), "#3526e0", "Month Distribution - Train"
)
month_test <- plot_level_counts(
  raw_test_data, aes(x = month), "#e08926", "Month Distribution - Test"
)
grid.arrange(month_train, month_test, ncol = 2)

## poutcome
poutcome_train <- plot_level_counts(
  raw_train_data, aes(x = poutcome), "#3526e0",
  "poutcome Distribution - Train"
)
poutcome_test <- plot_level_counts(
  raw_test_data, aes(x = poutcome), "#e08926",
  "poutcome Distribution - Test"
)
grid.arrange(poutcome_train, poutcome_test, ncol = 2)
## Pairwise Pearson correlations (the cor() default) between the continuous
## predictors, drawn as coefficients in the upper triangle only.
correlation <- cor(cont_var, method = "pearson")
corrplot(correlation, method = "number", type = "upper")
## Horizontal box plots of the continuous predictors on their raw scales.
## All variables share one value axis, whose tick labels are hidden.
cont_box <- ggplot(cont_var_melt, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = variable)) +
  coord_flip() +
  scale_fill_tableau() +
  labs(x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )
cont_box
## Box plots of the continuous predictors after min-max scaling, so that
## variables with very different ranges can be compared on one axis.

## Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
## to 1. A constant vector yields NaN (0 / 0), as before.
min_max_scale <- function(x) {
  bounds <- range(x)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}

## Scale column by column with lapply(); apply() would first coerce the whole
## table to a matrix.
df_cont_norm <- raw_train_data[, ..continuous_var]
df_cont_norm <- as.data.frame(lapply(df_cont_norm, min_max_scale))
df_cont_norm <- as.data.frame(melt(df_cont_norm))

cont_box_norm <- ggplot(df_cont_norm, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = variable)) +
  coord_flip() +
  scale_fill_tableau() +
  labs(x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )
cont_box_norm
## Box plot of the level codes of each discrete variable, one facet row per
## variable.
## `value` holds level labels. Under R >= 4.0 data.frame() keeps strings as
## character, so a bare as.numeric(value) would return only NA and leave the
## plot empty. Going through as.factor() yields integer codes over the pooled
## levels; it is a no-op when `value` is already a factor (R < 4.0).
disc_box <- ggplot(df_disc, aes(x = Var2, y = as.numeric(as.factor(value)))) +
  geom_boxplot(aes(fill = Var2)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_x_discrete(expand = c(0, 0)) +
  facet_wrap(~Var2, scales = "free", ncol = 1) +
  scale_fill_tableau() +
  ggtitle("Distribution of each discrete variable") +
  labs(fill = "", x = "", y = "") +
  coord_flip() +
  theme_light() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )
disc_box
## Density of every continuous predictor in the training set, split by the
## target `y`.

## Overlay the per-class densities of one column of `raw_train_data`.
##   mapping - aes() mapping that puts the column on x and y on color
##   title   - plot title
## Returns a ggplot object; calling it at top level prints the plot.
plot_target_density <- function(mapping, title) {
  ggplot(raw_train_data, mapping) +
    geom_density(alpha = 0.7) +
    ggtitle(title) +
    theme_classic() +
    ## Colours are assigned positionally to the values of `y`
    ## (presumably "no" first, then "yes" -- confirm against the data).
    scale_color_manual(values = c("#995052", "#529950"))
}

plot_target_density(aes(x = age, color = y), "Age Target Distribution")
plot_target_density(aes(x = balance, color = y), "Balance Target Distribution")
plot_target_density(aes(x = day, color = y), "Day Target Distribution")
plot_target_density(
  aes(x = duration, color = y), "Duration Target Distribution"
)
plot_target_density(
  aes(x = campaign, color = y), "Campaign Target Distribution"
)
plot_target_density(aes(x = pdays, color = y), "pdays Target Distribution")
plot_target_density(
  aes(x = previous, color = y), "Previous Target Distribution"
)
## Stacked bar charts of every discrete predictor in the training set, filled
## by the target `y` and annotated with the count of each segment.

## Build one stacked, count-labelled bar chart from `raw_train_data`.
##   mapping - aes() mapping that puts the categorical column on x and y on
##             fill
## Returns a ggplot object; calling it at top level prints the plot.
plot_target_counts <- function(mapping) {
  ggplot(raw_train_data, mapping = mapping) +
    geom_bar(col = "black") +
    ## position_stack(0.5) centres each label inside its bar segment.
    ## `..count..` is kept for compatibility with older ggplot2; on
    ## ggplot2 >= 3.3.0 the equivalent spelling is after_stat(count).
    geom_text(
      stat = "count",
      aes(label = ..count..),
      position = position_stack(0.5),
      color = "white",
      size = 3
    )
}

plot_target_counts(aes(job, fill = y))
plot_target_counts(aes(marital, fill = y))
plot_target_counts(aes(education, fill = y))
plot_target_counts(aes(default, fill = y))
plot_target_counts(aes(housing, fill = y))
plot_target_counts(aes(loan, fill = y))
plot_target_counts(aes(contact, fill = y))
plot_target_counts(aes(month, fill = y))
plot_target_counts(aes(poutcome, fill = y))